# Data cleaning 
# Article: Misinformation among Migrants
## Author: Daniel Rojas


# Remove all (non-hidden) objects from the current workspace so the script
# starts from a clean state. Attached packages are not affected.
rm(list = ls())

# set working directory to source file location

# Attach each package named in `packages`, installing it first when it is
# not available.
#   packages: character vector of package names.
# Packages are processed one at a time, in the order given: check, install
# if missing (with dependencies), then attach via library().
# Returns NULL invisibly (the value of the for loop).
install_and_load = function(packages) {
    for (package_name in packages) {
        is_available = requireNamespace(package_name, quietly = TRUE)
        if (!is_available) {
            install.packages(package_name, dependencies = TRUE)
        }
        library(package_name, character.only = TRUE)
    }
}

# Attach the packages this script relies on (installing any that are missing)
install_and_load(c('tidyverse', 'haven'))

# set working directory to source file location (the CSV paths below are relative)
# Load the two raw exports. The first two rows of each file are dropped right
# after reading (presumably extra header rows from the survey export --
# confirm). `dt_text` is later matched against answer labels, `dt` against
# numeric codes.
dt_text = read.csv('Misinformation_COL_September+3,+2024_05.48(1).csv')[-(1:2), ]
# StartDate arrives as "YYYY-MM-DD HH:MM:SS" text; keep only the calendar date.
dt = read.csv('Misinformation_COL_September+3,+2024_05.48.csv')[-(1:2), ] %>%
    mutate(StartDate = as.Date(StartDate, format = "%Y-%m-%d %H:%M:%S"))

# Keep valid respondents only: survey started after 2024-07-26, with
# consent == 1, adult == 1, country != 1, colombia == 1,
# temporal_status != 0, and an email containing '@'.
# Rows for which any condition evaluates to NA are dropped by filter().
dt = dt %>%
    filter(StartDate > as.Date("2024-07-26"),
           consent == 1,
           adult == 1,
           country != 1,
           colombia == 1,
           temporal_status != 0,
           str_detect(email, '@'))

# Keep only the first response per email address, then drop the email
# column so the cleaned data carries no contact information.
dt = dt %>%
    distinct(email, .keep_all = TRUE) %>%
    select(-email)

# Select and clean the variables of interest from the text-label export.
# Output: one row per response with ResponseId, age, and English
# "descriptive" recodes of the Spanish answer labels.
#
# The date filter uses dt_text's OWN StartDate. The previous version indexed
# dt_text with a logical vector built from dt$StartDate; by this point `dt`
# has already been filtered and de-duplicated, so that vector had a different
# length and was recycled against unrelated rows.
# Assumes dt_text$StartDate uses the same "YYYY-MM-DD HH:MM:SS" text format
# as the numeric export -- TODO confirm.
dt_text = dt_text %>%
    filter(as.Date(StartDate, format = "%Y-%m-%d %H:%M:%S") > as.Date("2024-07-26"),
           str_detect(email, '@')) %>%
    transmute(
        ResponseId,
        # age_1 is treated as a birth year; 2024 is hard-coded as the survey year
        age = 2024 - as.numeric(age_1),
        # country labels not listed here are passed through unchanged
        origin = case_when(country == 'Otro, ¿cuál?' ~ 'Other',
                           country == 'Haití' ~ 'Haiti',
                           country == 'México' ~ 'Mexico',
                           country == 'Perú' ~ 'Peru',
                           TRUE ~ country),
        # unlisted education labels become NA
        education_descriptive = case_when(
            education == 'Ninguno' ~ 'None',
            education == 'Primaria' ~ 'Complete primary',
            education == 'Primaria Incompleta' ~ 'Incomplete primary',
            education == 'Secundaria Incompleta' ~ 'Incomplete secondary',
            education == 'Secundaria' ~ 'Complete secondary',
            education == 'Universidad Incompleta' ~ 'Incomplete college',
            education == 'Universidad' ~ 'Complete college',
            education == 'Posgrado' ~ 'Postgraduate degree',
            TRUE ~ NA_character_),
        # unlisted race labels are passed through unchanged
        # NOTE(review): 'Indigeneous' is misspelled; left as-is because
        # downstream code may match on this exact value.
        race_descriptive = case_when(race == 'Blanco' ~ 'White',
                                     race == 'Indígena' ~ 'Indigeneous',
                                     race == 'Afrodescendiente' ~ 'Afro',
                                     race == 'No sabe' ~ 'DK',
                                     race == 'Otra, ¿cuál?' ~ 'Other',
                                     TRUE ~ race),
        # anything not matched (including missing) is coded 'Other'
        religion_descriptive = case_when(
            religion == 'Católico' ~ 'Catholic',
            religion == 'Protestante'
            | religion == 'Evangélica o pentecostal' ~ 'Protestant',
            religion == 'Agnóstico o ateo'
            | religion == 'Ninguna' ~ 'No religion',
            TRUE ~ 'Other'),
        marital_status_descriptive = case_when(
            marital_status == 'Casado' ~ 'Married',
            marital_status == 'Unión libre' ~ 'Free union',
            marital_status == 'Soltero' ~ 'Single',
            marital_status == 'Divorciado'
            | marital_status == 'Separado' ~ 'Separated',
            marital_status == 'Viudo' ~ 'Widow',
            TRUE ~ NA_character_),
        # Free-text destinations are matched verbatim (including trailing
        # spaces and typos as they appear in the raw data); both lists map
        # to 'Latin America'. Anything unmatched is coded 'Other'.
        destination_descriptive = case_when(
            destination == 'Canadá' ~ 'Canada',
            destination == 'España' ~ 'Spain',
            destination == 'Estados Unidos' ~ 'USA',
            destination_4_TEXT %in% c('Regresar a Venezuela ',
                                      'Venezuela',
                                      'Venezuela ',
                                      'Venezuela otra vez',
                                      'Vene9',
                                      'Venezu6',
                                      'Regresaría a Venezuela') ~ 'Latin America',
            destination_4_TEXT %in% c('Panamá',
                                      'Brasil',
                                      'Brasil ',
                                      'Brazil ',
                                      'Costa Rica',
                                      'Costa Rica ',
                                      'Mexico',
                                      'México',
                                      'México ',
                                      'República Dominicana ',
                                      'Peru',
                                      'Lima Perú ') ~ 'Latin America',
            destination == 'No sé' ~ 'DK',
            TRUE ~ 'Other'))

# Merge datasets: left join of the numeric data with the descriptive
# variables, keeping every row of `dt`.
# The key is stated explicitly; without `by`, merge() joins on every column
# name the two data frames share, so an accidental name collision would
# silently change the join.
dt = merge(dt, dt_text, by = 'ResponseId', all.x = TRUE)

# cleaning
# Names of the columns that are converted to numeric in the next step:
# every column starting with 'misinfo_share' plus the ones listed below.
variables_num = colnames(
    select(dt,
           starts_with('misinfo_share'),
           robbery0, robbery1,
           damage0, damage1,
           extorsion0, extorsion1,
           children, female, religion, marital_status, income,
           value_democracy, trust_people,
           motivated_reason_1_1, motivated_reason_2_1,
           crime, crime_online,
           trust_newspaper, trust_gov, trust_fb,
           trust_tt, trust_twitter, trust_wa,
           timing_m_treatment_Page.Submit)
)

# Build the analysis variables. The across() call runs first, so every column
# in `variables_num` is already numeric when the later expressions use it.
# Sample means used for the "above mean" splits ignore missing values
# (na.rm = TRUE); without it a single NA would turn the whole indicator
# column into NA.
dt = dt %>%
    mutate(across(all_of(variables_num), as.numeric),
           # demographics
           female = ifelse(female == 1, 1, 0),
           catholic = ifelse(religion == 1, 1, 0),
           married = ifelse(marital_status == 2, 1, 0),
           single = ifelse(marital_status == 1, 1, 0),
           kids_above_mean = ifelse(children > mean(children, na.rm = TRUE), 1, 0),
           children_num = children,
           children_binary = ifelse(children_num == 0, 0, 1),
           destination_US = ifelse(destination_descriptive == 'USA', 1, 0),
           education_num = case_when(education_descriptive == 'None' ~ 0,
                                     education_descriptive == 'Incomplete primary' ~ 1,
                                     education_descriptive == 'Complete primary' ~ 2,
                                     education_descriptive == 'Incomplete secondary' ~ 3,
                                     education_descriptive == 'Complete secondary' ~ 4,
                                     education_descriptive == 'Incomplete college' ~ 5,
                                     education_descriptive == 'Complete college' ~ 6,
                                     education_descriptive == 'Postgraduate degree' ~ 7,
                                     TRUE ~ NA_real_),
           low_education = ifelse(education_num < 3, 1, 0), # up to elementary
           highly_educated = ifelse(education_num > 5, 1, 0), # complete college or postgrad
           # covariates
           value_democracy_binary = ifelse(value_democracy > 2, 1, 0),
           trust_people_binary = ifelse(trust_people > 2, 1, 0),
           motivated_reasoning_arrival = motivated_reason_1_1,
           motivated_reasoning_arrival_binary = ifelse(
               motivated_reasoning_arrival > mean(motivated_reasoning_arrival, na.rm = TRUE),
               1, 0),
           motivated_reasoning_asylum = motivated_reason_2_1,
           motivated_reasoning_asylum_binary = ifelse(
               motivated_reasoning_asylum > mean(motivated_reasoning_asylum, na.rm = TRUE),
               1, 0),
           # row-wise mean of the two items; NA if either item is missing
           motivated_reasoning_index = rowMeans(cbind(motivated_reasoning_arrival,
                                                      motivated_reasoning_asylum)),
           motivated_reasoning_binary = ifelse(
               motivated_reasoning_index > mean(motivated_reasoning_index, na.rm = TRUE),
               1, 0),
           # info_outlet / info_source are searched for the digit of each
           # option, giving one 0/1 dummy per option
           info_outlet_newspapers = ifelse(str_detect(info_outlet, '1'), 1, 0),
           info_outlet_gov = ifelse(str_detect(info_outlet, '2'), 1, 0),
           info_outlet_fb = ifelse(str_detect(info_outlet, '3'), 1, 0),
           info_outlet_tiktok = ifelse(str_detect(info_outlet, '4'), 1, 0),
           info_outlet_twitter = ifelse(str_detect(info_outlet, '5'), 1, 0),
           info_outlet_whatsapp = ifelse(str_detect(info_outlet, '6'), 1, 0),
           # case_when() keeps the first matching branch, so this records only
           # the lowest-numbered option found in info_outlet
           info_outlet_all = case_when(str_detect(info_outlet, '1') ~ 'Newspaper',
                                       str_detect(info_outlet, '2') ~ 'Gov. Websites',
                                       str_detect(info_outlet, '3') ~ 'Facebook',
                                       str_detect(info_outlet, '4') ~ 'TikTok',
                                       str_detect(info_outlet, '5') ~ 'Twitter/X',
                                       str_detect(info_outlet, '6') ~ 'WhatsApp',
                                       TRUE ~ NA_character_),
           info_source_Col = ifelse(str_detect(info_source, '1'), 1, 0),
           info_source_US = ifelse(str_detect(info_source, '2'), 1, 0),
           info_source_ngo = ifelse(str_detect(info_source, '3'), 1, 0),
           info_source_family = ifelse(str_detect(info_source, '4'), 1, 0),
           info_source_acq = ifelse(str_detect(info_source, '5'), 1, 0),
           info_source_MigCol = ifelse(str_detect(info_source, '6'), 1, 0),
           # first matching branch wins, as for info_outlet_all
           # NOTE(review): option 6 reuses the 'Colombian Gov.' label of
           # option 1, while its dummy above is named info_source_MigCol --
           # confirm the intended label before relying on this variable.
           info_source_all = case_when(str_detect(info_source, '1') ~ 'Colombian Gov.',
                                       str_detect(info_source, '2') ~ 'US Gov.',
                                       str_detect(info_source, '3') ~ 'NGOs',
                                       str_detect(info_source, '4') ~ 'Family',
                                       str_detect(info_source, '5') ~ 'Known people',
                                       str_detect(info_source, '6') ~ 'Colombian Gov.',
                                       TRUE ~ NA_character_),
           crime_journey = crime,
           crime_journey_online = ifelse(crime_online == 1, 1, 0),
           # checks: an answer of 3 is coded as passing
           attention1 = ifelse(attention_check1 == 3, 1, 0),
           attention2 = ifelse(attention_check2 == 3, 1, 0),
           attention_1_2 = ifelse(attention_check1 == 3 & attention_check2 == 3, 1, 0),
           attention_posttreatment = ifelse(attention_check3 == 3, 1, 0),
           # experiment
           # average of the three items with suffix 0 / suffix 1; NA when any
           # of the three is missing
           inperson_crime = rowSums(cbind(robbery0,
                                          damage0,
                                          extorsion0)) / 3,
           online_crime = rowSums(cbind(robbery1,
                                        damage1,
                                        extorsion1)) / 3,
           online_priming = ifelse(!is.na(robbery1), 1, 0),
           # takes the in-person average when present, otherwise the online one
           crime_exposure = coalesce(inperson_crime, online_crime),
           # 0 when the display-order field contains 't0', 1 otherwise
           treatment = ifelse(str_detect(Treatment_misinformation_DO, 't0'), 0, 1),
           # outcomes
           trust_newspaper_binary = ifelse(trust_newspaper > 2, 1, 0),
           trust_gov_binary = ifelse(trust_gov > 2, 1, 0),
           trust_fb_binary = ifelse(trust_fb > 2, 1, 0),
           trust_tt_binary = ifelse(trust_tt > 2, 1, 0),
           trust_twitter_binary = ifelse(trust_twitter > 2, 1, 0),
           trust_wa_binary = ifelse(trust_wa > 2, 1, 0),
           trust_sources_index = rowMeans(cbind(trust_newspaper, trust_gov,
                                                trust_fb, trust_tt,
                                                trust_twitter, trust_wa)),
           trust_sources_index_binary = rowMeans(cbind(trust_newspaper_binary,
                                                       trust_gov_binary,
                                                       trust_fb_binary,
                                                       trust_tt_binary,
                                                       trust_twitter_binary,
                                                       trust_wa_binary)),
           # items 2-3 feed dissemination_f, items 1, 4, 5 feed
           # dissemination_r; discernment is their difference
           dissemination_f = rowMeans(cbind(misinfo_share2_e,
                                            misinfo_share3_e)),
           dissemination_r = rowMeans(cbind(misinfo_share1_e,
                                            misinfo_share4_e,
                                            misinfo_share5_e)),
           sharing_discernment = dissemination_r - dissemination_f)


# Write the cleaned data frame to disk (path is relative to the working directory)
saveRDS(object = dt, file = 'clean_data_COL.rds')














